##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 6: Results reported in SI Section D
##################################################################
##################################################################


# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalima 10.15.5


# load packages required to run this script
library(texreg)     # CRAN v1.36.23
library(estimatr)   # CRAN v0.22.0
library(ggplot2)    # CRAN v3.3.2
library(ggbeeswarm) # CRAN v0.6.0
library(broom)      # CRAN v0.5.5
library(dplyr)      # CRAN v1.0.0
library(tidyr)      # CRAN v1.1.0
library(emmeans)    # CRAN v1.4.5
library(scales)     # CRAN v1.1.1

# create custom ggplot2 scheme
theme_baser <- function (){
  theme_minimal()  %+replace%
    theme(panel.grid.minor.x = element_blank(),
          panel.grid.minor.y = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.border = element_rect(fill = NA,color = "black", size = 0.5,
                                      linetype = "solid"),
          legend.title = element_text(size = 15),
          plot.title = element_text(size = 15, face = "italic",
                                    vjust = 1.5, hjust = 0,
                                    margin=margin(0, 0, 12 ,0)),
          legend.position = "bottom",
          axis.ticks = element_line(size = 0.3),
          axis.ticks.length = unit(0.2, "cm"),
          legend.text=element_text(size = 13),
          strip.text = element_text(size = 15, hjust = 0.5,
                                    margin = margin(b = 5, r = 5, l = 5, t = 5)),
          axis.text = element_text(colour = "black", size = 13),
          axis.title = element_text(size = 13, hjust = 0.5))
}

# set theme
theme_set(theme_baser())

dat_combined_all <- readRDS("data_manifestos_classified.rds")

length(unique(dat_combined_all$manifesto_id))

# check how many long sentences and in which manifestos!

dat_combined_long <- dat_combined_all %>% 
    group_by(manifesto_id, countryname, party, partyname, date) %>% 
    summarise(mean_ntoken = mean(ntoken),
              max_ntoken = max(ntoken),
              median_ntoken = median(ntoken),
              min_tnoken = min(ntoken), 
              sum_ntoken = sum(ntoken)) %>% 
    arrange(-max_ntoken)


# remove all sentences with more than 99 tokens
dat_combined <- filter(dat_combined_all, ntoken < 100)

dat_combined$year <- as.numeric(dat_combined$year)


# calculate absolute number of words and proportions per class and manifesto
dat_combined_props <- dat_combined %>% 
    group_by(countryname, partyname, party_family, language, date, incumbency_status2_factor,
             manifesto_id, extremist_party, gdp_growth_lag, 
             rile, year, class) %>%
    summarise(n_sentences = n(),
              n_words = sum(ntoken)) %>%
    mutate(relfreq_sentences = n_sentences / sum(n_sentences),
           relfreq_words  = n_words / sum(n_words))


# subset datasets with proportion for each class
dat_combined_props <- dat_combined_props %>% 
    mutate(election_id = paste(countryname, date))

dat_combined_props_past <- filter(dat_combined_props, class == "Past")
dat_combined_props_present <- filter(dat_combined_props, class == "Present")
dat_combined_props_future <- filter(dat_combined_props, class == "Future")

dat_combined_props_past$incumbency_status2_factor <- factor(dat_combined_props_past$incumbency_status2_factor,
                                                            levels = c("Opposition", "Incumbent"))
dat_combined_props_present$incumbency_status2_factor <- factor(dat_combined_props_present$incumbency_status2_factor,
                                                               levels = c("Opposition", "Incumbent"))
dat_combined_props_future$incumbency_status2_factor <- factor(dat_combined_props_future$incumbency_status2_factor,
                                                              levels = c("Opposition", "Incumbent"))

dat_combined_props_present$extremist_party <- factor(dat_combined_props_present$extremist_party,
                                                     levels = c("Not extremist party", "Extremist party"))
dat_combined_props_future$extremist_party <- factor(dat_combined_props_future$extremist_party,
                                                    levels = c("Not extremist party", "Extremist party"))
dat_combined_props_past$extremist_party <- factor(dat_combined_props_past$extremist_party,
                                                  levels = c("Not extremist party", "Extremist party"))

# remove observations that do not contain information on GDP growth (lagged by one year)
dat_combined_props_past_nona <- filter(dat_combined_props_past, !is.na(gdp_growth_lag))
sd(dat_combined_props_past_nona$relfreq_sentences, na.rm = TRUE)

# run regression models with relative frequencies per class as dependent variable
lm_prop_past <- lm_robust(relfreq_sentences ~ 
                              incumbency_status2_factor + 
                              poly(rile, 2) + extremist_party + gdp_growth_lag +  year,
                          cluster = election_id,
                          fixed_effects = countryname, 
                          data = dat_combined_props_past)


# check standard deviation of emphasis on past
coef_past <- 0.05
sd_past <- sd(dat_combined_props_past_nona$relfreq_sentences, na.rm = TRUE)

coef_past / sd_past 

lm_prop_present <- update(lm_prop_past,
                          . ~ ., 
                          data = dat_combined_props_present)

lm_prop_future <- update(lm_prop_past,
                         . ~ ., 
                         data = dat_combined_props_future)


# Table A9 ----
screenreg(list(lm_prop_past,
               lm_prop_present,
               lm_prop_future))


texreg(list(lm_prop_past,
            lm_prop_present,
            lm_prop_future),
       custom.model.names = c("Model 1 (Past)",
                              "Model 2 (Present)",
                              "Model 3 (Future)"),
       custom.coef.names = c("Incumbent",
                             "RILE", 
                             "RILE$^2$",
                             "Extremist party",
                             "GDP growth",
                             "Year"),
       float.pos = "!h",
       label = "tab:reg_prop_main",
       fontsize = "footnotesize",
       caption.above =  TRUE,
       include.variance=FALSE,
       longtable=FALSE,
       use.packages=FALSE,
       custom.gof.names = c("R$^2$", "Adj. R$^2$",
                            "Number of observations",
                            "RMSE", "Number of clusters (Elections)"),
       file="taba09.tex",
       caption = "Predicting the emphasis on the past, present, and future in party manifestos",
       custom.note = ("\\parbox{0.85\\linewidth}{\\footnotesize \\vspace{2pt}%stars. \\\\
                      \\textit{Note}: 95\\% confidence intervals in parentheses. All models are
                      linear regressions with fixed effects for countries. 
                      Robust standard errors clustered by election.}"))


# calculate squared value of rile for calculation post hoc comparisons of interaction
dat_combined_props_past_emmeans <- dat_combined_props_past %>% 
    mutate(rile_squred = rile^2)

dat_combined_props_past_emmeans$incumbency_status2_factor <- factor(dat_combined_props_past_emmeans$incumbency_status2_factor,
                                                                    levels = c("Incumbent", "Opposition"))
dat_combined_props_past_emmeans$extremist_party <- factor(dat_combined_props_past_emmeans$extremist_party,
                                                          levels = c("Not extremist party", "Extremist party"))

lm_prop_past_emmeans <- lm_robust(relfreq_sentences ~ 
                                      incumbency_status2_factor * countryname +  rile + rile_squred +
                                      extremist_party + 
                                      gdp_growth_lag +  year,
                                  cluster = election_id,
                                  data = dat_combined_props_past_emmeans)


# Table A10 ----
screenreg(lm_prop_past_emmeans)

texreg(list(lm_prop_past_emmeans),
       omit.coef = c("(Intercept)"),
       custom.model.names = c("Model 1"),
       custom.coef.names = c("Opposition (ref.: Incumbent)",
                             "Austria (ref.: Australia)",
                             "Canada", 
                             "Germany", 
                             "Ireland", 
                             "New Zealand",
                             "Switzerland", 
                             "United Kingdom",
                             "United States", 
                             "RILE", 
                             "RILE$^2$",
                             "Extremist party",
                             "GDP growth",
                             "Year",
                             "Opposition $\\times$ Austria",
                             "Opposition $\\times$ Canada", 
                             "Opposition $\\times$ Germany", 
                             "Opposition $\\times$ Ireland", 
                             "Opposition $\\times$ New Zealand",
                             "Opposition $\\times$ Switzerland", 
                             "Opposition $\\times$ United Kingdom",
                             "Opposition $\\times$ United States"),
       float.pos = "!h",
       label = "tab:reg_prop_posthoc",
       fontsize = "footnotesize",
       caption.above =  TRUE,
       include.variance=FALSE,
       longtable=FALSE,
       use.packages=FALSE,
       custom.gof.names = c("R$^2$", "Adj. R$^2$",
                            "Number of observations",
                            "RMSE", "Number of clusters (Elections)"),
       file="taba10.tex",
       caption = "Predicting the emphasis on the past in party manifestos",
       custom.note = ("\\parbox{0.5\\linewidth}{\\footnotesize \\vspace{2pt}%stars. \\\\
                      \\textit{Note}: 95\\% confidence intervals in parentheses.
                      Robust standard errors clustered by election.}"))



# run two-sample t-tests for the emphasis on the past conditional on incumbency status
dat_combined_props_past_ttest <- dat_combined_props_past
dat_combined_props_past_ttest$incumbency_status2_factor <- factor(dat_combined_props_past_ttest$incumbency_status2_factor,
                                                                  levels = c("Incumbent", "Opposition"))
# t-test with 95% confidence intervals
ttests_past_95 <-  dat_combined_props_past_ttest %>% 
    group_by(countryname) %>% 
    do(tidy(t.test(relfreq_sentences ~ incumbency_status2_factor, data = .)))

# repeat for 90% confidence intervals
ttests_past_90 <-  dat_combined_props_past_ttest %>% 
    group_by(countryname) %>% 
    do(tidy(t.test(relfreq_sentences ~ incumbency_status2_factor, conf.level = 0.9, data = .))) %>% 
    select(conf.low90 = conf.low,
           conf.high90 = conf.high,
           countryname)

ttests_past <- left_join(ttests_past_95, ttests_past_90, by = "countryname")

# Figure A13a ----
ggplot(ttests_past, aes(x = countryname,
                        y = estimate, ymin = conf.low, ymax = conf.high)) +
    geom_point(size = 3) +
    geom_linerange(aes(ymin = conf.low,
                       ymax = conf.high),
                   size = 0.6) +
    geom_linerange(aes(ymin = conf.low90,
                       ymax = conf.high90), 
                   size = 1.3) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
    coord_flip() +
    labs(x = NULL, y = "Difference in means")
ggsave("fga13a.pdf", width = 5, height = 4)


# calculate pairwise post-hoc comparisons
# https://aosmith.rbind.io/2019/03/25/getting-started-with-emmeans/#confidence-intervals-for-comparisons
rg <- qdrg(object = lm_prop_past_emmeans, data = dat_combined_props_past_emmeans)

contrasts_countryname_inc <- emmeans(rg, pairwise ~  incumbency_status2_factor |countryname)$contrasts %>% 
    broom::tidy() 

# Figure A13b ----
ggplot(contrasts_countryname_inc, aes(x = countryname,
                                      y = estimate)) +
    geom_point(size = 3) +
    geom_linerange(aes(ymin = estimate - 1.96 * std.error,
                       ymax = estimate + 1.96 * std.error),
                   size = 0.6) +
    geom_linerange(aes(ymin = estimate - 1.645 * std.error,
                       ymax = estimate + 1.645 * std.error), 
                   size = 1.3) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
    coord_flip() +
    labs(x = NULL,
         y = "Pairwise post hoc comparisons")
ggsave("fga13b.pdf", width = 5, height = 4)



# get manifestos with fewer than three observations per class
# and add 0 for missing class (since proportion of this class is 0)

length(unique(dat_combined_props$manifesto_id))
nrow(dat_combined_props) /3

# filter manifestos with fewer than three classes (past is missing)
dat_combined_props_2 <- dat_combined_props %>% 
    group_by(manifesto_id) %>% 
    mutate(n_classes = n()) %>% 
    filter(n_classes < 3) %>%
    select(countryname:year) %>% 
    unique()

# change past information as 0 for these manifestos
dat_combined_props_2_past <- dat_combined_props_2 %>% 
    mutate(class = "Past",
           relfreq_sentences = 0,
           relfreq_words = 0,
           n_sentences = 0,
           n_words = 0)


nrow(dat_combined_props_2)

# bind the data frames (to have 3 observations per manifesto)
dat_combined_props <- bind_rows(dat_combined_props, 
                                dat_combined_props_2_past)


# Figure A14 ----

# create plot with proportions over time
ggplot(dat_combined_props, aes(x = year, 
                               y = relfreq_sentences,
                               colour = class,
                               linetype = class)) +
    geom_point(alpha = 0.3) +
    geom_smooth() +
    scale_colour_manual(values = c("black", "darkred", "darkgreen")) +
    facet_wrap(~countryname) +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
    labs(x = "Year", y = "Percentage of temporal emphasis in party manifestos") +
    theme(legend.title = element_blank())
ggsave("fga14.pdf",
       width = 10, height = 8)


dat_combined_props$incumbency_status2_factor <- factor(
    dat_combined_props$incumbency_status2_factor,
    levels = c("Incumbent", "Opposition")
)


# new data frame for plots with party families
dat_combined_props_parfam <- dat_combined_props

dat_combined_props_parfam$class <- factor(dat_combined_props_parfam$class,
                                          levels = c("Past", "Present", "Future"))

# Figure A15 ----
ggplot(dat_combined_props_parfam, aes(x = party_family, 
                                      y = relfreq_sentences)) +
    geom_boxplot(position = position_dodge(width = 0.6),
                 width = 0.5, outlier.colour = "white") +
    ggbeeswarm::geom_quasirandom(position = position_dodge(width = 0.6),
                                 dodge.width = 0.6, alpha = 0.1, size = 1) +
    facet_wrap(~class, nrow = 1) +
    coord_flip() +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
    labs(x = NULL, y = "Percentage of temporal emphasis in party manifestos") +
    theme(legend.title = element_blank(),
          legend.position = "bottom")
ggsave("fga15.pdf",
       width = 10, height = 4.5)


# calculate the proportion of future to past sentences (future) / (future + past)
dat_combined_props_past_future <- dat_combined %>% 
    group_by(countryname, partyname, party_family, language, date, incumbency_status2_factor,
             manifesto_id, extremist_party, gdp_growth_lag, 
             rile, year, class) %>%
    summarise(n_sentences = n(),
              n_words = sum(ntoken)) %>%
    spread(class, n_sentences) %>% 
    group_by(manifesto_id) %>% 
    mutate(prop_past_future = sum(Future, na.rm = TRUE) / (sum(Future, na.rm = TRUE) + sum(Past, na.rm = TRUE)))

dat_combined_props_past_future <- dat_combined_props_past_future %>% 
    select(countryname, partyname, party_family, language, date, incumbency_status2_factor,
             manifesto_id, extremist_party, gdp_growth_lag, 
             rile, year, prop_past_future) %>%
    unique() %>% 
    mutate(election_id = paste0(countryname, date))


dat_combined_props_past_future$incumbency_status2_factor <- factor(
    dat_combined_props_past_future$incumbency_status2_factor,
    levels = c("Incumbent", "Opposition")
)


## Figure A16 ----
ggplot(dat_combined_props_past_future, aes(x = incumbency_status2_factor, 
                               y = prop_past_future,
                               colour = incumbency_status2_factor)) +
    geom_boxplot(position = position_dodge(width = 0.6),
                 width = 0.5, outlier.colour = "white") +
    ggbeeswarm::geom_quasirandom(position = position_dodge(width = 0.6),
                                 dodge.width = 0.6, alpha = 0.2, size = 1) +
    scale_colour_manual(values = c("black", "red")) +
    facet_wrap(~countryname, scales = "free_x") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
    labs(x = NULL, y = "Future / (Future + Past)") +
    theme(legend.title = element_blank(),
          legend.position = "bottom")
ggsave("fga16.pdf",
       width = 10, height = 8)


# run regression models with proportion of past/present/future as dependent variable

dat_combined_props_past_future$extremist_party <- factor(dat_combined_props_past_future$extremist_party,
                                                         levels = c("Not extremist party", "Extremist party"))

lm_prop_past_future_base <- lm_robust(prop_past_future ~ 
                              incumbency_status2_factor + 
                              poly(rile, 2) + extremist_party +   year,
                          cluster = election_id,
                          fixed_effects = countryname, 
                          data = dat_combined_props_past_future)
      

lm_prop_past_future <- lm_robust(prop_past_future ~ 
                                     incumbency_status2_factor + 
                                     poly(rile, 2) + extremist_party + gdp_growth_lag +  year,
                                 cluster = election_id,
                                 fixed_effects = countryname, 
                                 data = dat_combined_props_past_future)


screenreg(list(lm_prop_past_future_base,
            lm_prop_past_future))

# Table A11 ----
texreg(list(lm_prop_past_future_base,
            lm_prop_past_future),
       custom.coef.names = c("Incumbent",
                             "RILE", 
                             "RILE$^2$",
                             "Extremist party",
                             "Year",
                             "GDP growth"),
       float.pos = "!h",
       label = "tab:reg_prop_emphasis_past_future",
       fontsize = "footnotesize",
       caption.above =  TRUE,
       include.variance=FALSE,
       longtable=FALSE,
       use.packages=FALSE,
       custom.gof.names = c("R$^2$", "Adj. R$^2$",
                            "Number of observations",
                            "RMSE", "Number of clusters (Elections)"),
       file="taba11.tex",
       caption = "Predicting the focus on retrospective (future) relative to purely retrospective (past) statements in party manifestos",
       custom.note = ("\\parbox{0.75\\linewidth}{\\footnotesize \\vspace{2pt}%stars. \\\\
                      \\textit{Note}: 95\\% confidence intervals in parentheses. 
                      The dependent variable is estimated as (future) / (future + past). 
                      All models are
                      linear regressions with fixed effects for countries. 
                      Robust standard errors clustered by election.}"))



# calculate class proportions using the number of words 
# and the number of sentences
dat_words_prop <- dat_combined %>% 
    group_by(language_capital, countryname, manifesto_id, class) %>% 
    summarise(sum_words_class = sum(ntoken, na.rm = TRUE)) %>% 
    group_by(manifesto_id) %>% 
    mutate(sum_words_manifesto = sum(sum_words_class)) %>% 
    mutate(relfreq_words_class = sum_words_class / sum_words_manifesto)

dat_sentences_prop <- dat_combined %>% 
    group_by(manifesto_id, class) %>% 
    summarise(sum_sentences_class = n()) %>% 
    group_by(manifesto_id) %>% 
    mutate(sum_sentences_manifesto = sum(sum_sentences_class)) %>% 
    mutate(relfreq_sentences_class = sum_sentences_class / sum_sentences_manifesto )

dat_words_sentences_prop <- left_join(dat_words_prop, 
                                      dat_sentences_prop)

# correlations per class and language
dat_words_sentences_prop <- dat_words_sentences_prop %>% 
    group_by(language_capital, class) %>% 
    mutate(cor = cor(relfreq_words_class, relfreq_sentences_class, use = "pairwise.complete")) %>% 
    mutate(country_class_cor = paste0(language_capital, ": ", class, " (r=", round(cor, 2), ")"))


# Figure A17 ----
ggplot(dat_words_sentences_prop, aes(x = relfreq_words_class,
                                     y = relfreq_sentences_class)) +
    geom_point(alpha  = 0.3) +
    facet_wrap(~country_class_cor) +
    labs(x = "Proportion of class with words as unit of analysis",
         y = "Proportion of class with (quasi-)sentences as unit of analysis")
ggsave("fga17.pdf",
       width = 10, height = 7)
